Thu Jun 19 16:51:53 2025
Import the Seurat object: previously processed, annotated with cell labels, assigned gene-set scores using AUCell (cancersea state signature scores), and cells that contain EGFP plasmid fragments have been labeled.
seu <- readRDS(seurat_file)
# simplify cell type labels (group rare ones into "other")
seu$celltype <- ifelse(seu$celltype %in% names(which(table(seu$celltype) < 100)),
'other', seu$celltype)
# update group labels
conditions <- list('PoolA' = 'Recovery',
'PoolB' = 'DSS',
'PoolC' = 'Untreated')
seu$condition <- paste(conditions[seu$sample])
egfp_cells <- colnames(seu)[seu$egfp_status == 'EGFP_POS']
From the single-cell sequencing of samples under 3 conditions, we managed to pick up about 1.5% of cells to contain at least one unique read originating from the EGFP-plasmid. Those cells are supposed to contain NFKB promoters thus, EGFP-positive cells are likely to contain increased NFKB activity.
knitr::kable(table(seu$egfp_status, seu$condition),
caption = 'Number of cells per condition categorized by EGFP status')
| DSS | Recovery | Untreated | |
|---|---|---|---|
| EGFP_NEG | 5257 | 7153 | 6561 |
| EGFP_POS | 76 | 116 | 116 |
Note: Black dots represent cells for which we could pick up at least one EGFP-plasmid fragment.
p <- DimPlot(seu, group.by = 'celltype')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
p <- DimPlot(seu, group.by = 'condition')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
p <- DimPlot(seu, group.by = 'celltype', split.by = 'condition')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
Comparison of cancer state signature scores in egfp+ and egfp- cells in different conditions
sigs <- colnames(seu@meta.data)[10:23]
dt <- data.table(seu@meta.data)
mdt <- melt.data.table(dt, measure.vars = sigs)
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
scale_fill_manual(values = c('blue', 'red')) +
labs(y = 'Signature', x = 'AUCell score')
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = condition), alpha = 0.5) +
labs(y = 'Signature', x = 'AUCell score')
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
scale_fill_manual(values = c('blue', 'red')) +
labs(y = 'Signature', x = 'AUCell score') +
facet_grid(~ condition)
Now, we focus on epithelial cells (cells annotated as epithelial and also are cdh1+) to look for NKFB activity signatures by comparing EGFP+ cells iwth EGFP- cells in different conditions.
For each gene, we check ratio of epithelial cells expressing the marker in different conditions; split by egfp status
qPCR_markers <- c('Egfp', 'Lgr5', 'Chga', 'Muc2', 'Dclk1', 'Tnfaip3', 'Noxa1',
'Tnf', 'Bcl2l1', 'Bcl2')
M <- GetAssayData(seu)
epithelial_cells <- intersect(colnames(seu)[seu$celltype == 'Epithelial cells'],
names(which(M['Cdh1',] > 0)))
# setdiff(intersect(colnames(seu)[seu$celltype == 'Epithelial cells'],
# names(which(M['Cdh1',] > 0))),
# names(which(M['Ptprc',] > 0)))
seu_epi <- seu[,epithelial_cells]
dt <- data.table(seu_epi@meta.data, keep.rownames = T)
dt <- cbind(dt, sapply(qPCR_markers, function(x) {
M[x, dt$rn]
}))
df <- do.call(rbind, lapply(qPCR_markers, function(x) {
s <- dt[,list('percent_expressed' = round(sum(get(x) > 0)/length(get(x)) * 100, 1),
'avg_expression' = mean(get(x))),by = c('condition', 'egfp_status')]
s$gene <- x
return(s)
}))
df$egfp_status <- factor(df$egfp_status, levels = c('EGFP_POS', 'EGFP_NEG'))
df$condition <- factor(df$condition, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = egfp_status, y = percent_expressed)) +
geom_bar(stat = 'identity', aes(fill = condition), position = 'dodge') +
facet_wrap(~ gene, scales = 'free', nrow = 2) +
scale_fill_brewer(type = 'qual', palette = 6)
# find markers for egfp+/egpf- in cdh1+ epithelial cells
markers <- sapply(simplify = F, unique(seu_epi$condition), function(x) {
message(date(), " => computing markers for condition ",x)
dt <- data.table(FindMarkers(seu_epi[,seu_epi$condition == x],
min.pct = 0.1,
logFoldChange = 0.25,
test.use = 'wilcox',
ident.1 = 'EGFP_POS',
ident.2 = 'EGFP_NEG',
only.pos = TRUE,
group.by = 'egfp_status'),
keep.rownames = T)
dt$condition <- x
return(dt)
})
markers <- do.call(rbind, markers)
m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:21], by = .(condition)][!is.na(rn)][rn != 'Egfp']
p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'],
features = unique(m$rn),
group.by = 'condition', scale = F) + coord_flip()
df <- p$data
df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = features.plot, y = id)) +
geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
labs(color = "Average\nExpression", size = "Percent\nExpression") +
# scale_color_gradient(low = '#672976', high = 'yellow') +
scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +
scale_size(range = c(0, 6)) +
theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'),
axis.text = element_text(size = 14, family = 'Arial'),
legend.title = element_text(family = 'Arial'))
# subset of genes of interest
# I don't know how these are selected by Marina
goi <- c('Steap4', 'Smim19', 'Cox17', 'Nfkbib', 'Man2a2', 'Sept10', 'Acot13', 'Ndufv2', 'Mrps15', 'Aqp8', 'H2-Eb1', 'Ly6g', 'Atf3')
p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'],
features = goi,
group.by = 'condition', scale = F) + coord_flip()
df <- p$data
df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = features.plot, y = id)) +
geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
labs(color = "Average\nExpression", size = "Percent\nExpression") +
scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +
scale_size(range = c(0, 6)) +
theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'),
axis.text = element_text(size = 14, family = 'Arial'),
legend.title = element_text(family = 'Arial'))
Found top markers expressed in at least 15% of EGFP+ cells and filtered for padj < 0.1
colorPal <- colorRampPalette(c("darkorchid4", "yellow"))(50)
M <- GetAssayData(seu_epi)
#top <- markers[pct.1 > 0.15][p_val_adj < 0.1]
top <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:20], by = .(condition)][!is.na(rn)]
B <- get_basis_matrix(t(as.matrix(M[unique(top$rn),])),
as.factor(paste(seu_epi$condition, seu_epi$egfp_status)))
sample_order <- c('Untreated EGFP_POS', 'Untreated EGFP_NEG',
'DSS EGFP_POS', 'DSS EGFP_NEG',
'Recovery EGFP_POS', 'Recovery EGFP_NEG')
pheatmap::pheatmap(t(B[sample_order,]), scale = 'row', cluster_cols = F,
cellwidth = 20, fontsize_row = 9, color = colorPal,
gaps_col = c(2,4))
# import nfkb target genes; (for marker analysis )
nfkb <- readLines(file.path(datadir, 'nfkb_targets_mouse.txt'))
dt <- melt.data.table(markers, id.vars = c('rn', 'condition'), measure.vars = c('pct.1', 'pct.2'))
dt$variable <- ifelse(dt$variable == 'pct.1', 'EGFP_POS', 'EGFP_NEG')
dt$group <- paste(dt$condition, dt$variable)
dtc <- dcast.data.table(dt, rn ~ group, value.var = 'value')
dtc[is.na(dtc)] <- 0
m <- as.matrix(data.frame(dtc[,-1], row.names = dtc[[1]], check.names = F))
genes <- unique(top$rn)
ann_row <- data.frame('condition' = markers[rn %in% genes, .SD[which.min(p_val)],by = rn][match(genes, rn)]$condition,
#'is_nfkb_target' = as.factor(genes %in% nfkb),
row.names = genes)
pheatmap::pheatmap(m[genes, sample_order], scale = 'row',
color = colorPal, cellwidth = 20,
main = 'Ratio of cells expressing top markers',
annotation_row = ann_row,
gaps_col = c(2, 4),
cluster_cols = F,
fontsize = 9, cutree_rows = 3)
We want to apply some things learned from the single-cell data to the bulk RNA-seq data we have analyzed before.
# get RUVSeq normalized counts from bulk rnaseq data
sample_sheet <- read.csv(file.path(datadir, 'rnaseq', 'sample_sheet.csv'))
counts <- read.table(file.path(datadir, 'rnaseq', 'feature_counts', 'raw_counts', 'hisat2', 'counts.tsv'), header = T, check.names = F)
colData <- data.frame(sample_sheet[,-1], row.names = sample_sheet$name)
set <- newSeqExpressionSet(counts = as.matrix(counts),
phenoData = colData)
differences <- makeGroups(colData$sample_type)
set_s <- RUVs(set, unique(rownames(set)),
k=10, differences) #all genes
norm_counts <- log(normCounts(set_s)+1)
Make a heatmap of NFKB target genes (exclusively expressed in Cdh1+ epithelial cells only )
# subset norm_counts for nfkb genes
ids <- readRDS(file.path(datadir, 'ensmusg2name.RDS'))
nfkb_ids <- ids[match(nfkb, name)]$id
M_nfkb <- norm_counts[nfkb_ids,]
# convert to gene names
rownames(M_nfkb) <- ids[match(rownames(M_nfkb), id)]$name
M <- GetAssayData(seu_epi)
nonepi_cells <- colnames(seu)[seu$celltype != 'Epithelial cells']
M_nonepi <- GetAssayData(seu[,nonepi_cells])
genes <- intersect(rownames(M), rownames(M_nfkb))
# get genes expressed in at least 1% of epithelial cells
genes <- names(which(apply(M[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) > 0.5))
# remove genes expressed in more than 5% of the non epithelial cells
genes <- names(which(apply(M_nonepi[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) < 3))
# keep activated / repressed samples only
genes <- names(which(rowSums(M_nfkb[genes,]) > 0))
colData$nfkb <- gsub(".+_", "", colData$sample_type)
colData$dss <- gsub("_.+", "", colData$sample_type)
pheatmap::pheatmap(M_nfkb[genes, ], scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cluster_cols = F,
cellwidth = 15,
color = rev(colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(10)),
gaps_col = c(8, 16),
cutree_rows = 5,
show_colnames = F)
We score bulk rnaseq samples using different gene signatures.
score_gene_sets <- function(exprs, genesets) {
rankData <- singscore::rankGenes(exprs)
s <- pbapply::pbsapply(genesets, function(x) {
x <- intersect(x, rownames(rankData))
singscore::simpleScore(rankData, upSet = x)[['TotalScore']]
})
rownames(s) <- colnames(exprs)
return(s)
}
find_differential_signatures <- function(scores, samplesA, samplesB) {
groupA <- intersect(colnames(scores), samplesA)
groupB <- intersect(colnames(scores), samplesB)
message("Comparing ",length(groupA), " samples (A) with ",length(groupB), " samples (B)")
stats <- do.call(rbind, lapply(rownames(scores), function(x) {
a <- scores[x, groupA]
b <- scores[x, groupB]
data.table('signature' = x, 'groupA' = mean(a),
'groupB' = mean(b),
'pval' = wilcox.test(a, b)[['p.value']])
}))
stats$padj <- p.adjust(stats$pval, method = 'BH')
stats <- stats[order(pval)]
return(stats)
}
gex <- data.frame(norm_counts, check.names = F)
# convert to gene names
ids <- readRDS(file.path(datadir, 'ensmusg2name.RDS'))
gex$name <- ids[match(rownames(gex), id)]$name
gex <- gex[!is.na(gex$name),]
gex <- gex[!duplicated(gex$name),]
rownames(gex) <- gex$name
gex$name <- NULL
For each condition, we found the signature genes that distinguish egfp+ from egfp- cells. Now, we score the bulk samples by these signatures.
m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:51], by = .(condition)][rn != 'Egfp']
egfp_signatures <- sapply(simplify = F, unique(m$condition), function(x) m[condition == x]$rn)
egfp_signature_scores <- score_gene_sets(gex, egfp_signatures)
egfp_signature_scores <- egfp_signature_scores[,c('Untreated', 'DSS', 'Recovery')]
pheatmap::pheatmap(t(egfp_signature_scores), cluster_cols = F,
scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cellwidth = 15, cellheight = 10,
color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10), cluster_rows = F,
show_colnames = F,
gaps_col = c(8, 16),
main = 'EGFP+ condition signature scores in bulk rnaseq')
xcell <- readRDS(file.path(datadir, 'xCell_signatures.mouse_genenames.RDS'))
# immune cell groups from xcell
immune_cells <- c("B-cells", "CD4+ T-cells",
"CD8+ T-cells", "DC", "Eosinophils", "Macrophages", "Monocytes",
"Mast cells", "Neutrophils", "NK cells")
xcell_scores <- score_gene_sets(gex, xcell[immune_cells])
pheatmap::pheatmap(t(xcell_scores), scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cellwidth = 15, cellheight = 10,
show_colnames = F,
cluster_cols = F,
gaps_col = c(8, 16),
color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),
main = 'Immune Cell Type Signature Scores in Bulk RNAseq')
print(sessionInfo())
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] Seurat_5.0.0 SeuratObject_5.1.0
## [3] sp_2.2-0 RUVSeq_1.42.0
## [5] edgeR_4.6.2 limma_3.64.1
## [7] EDASeq_2.42.0 ShortRead_1.66.0
## [9] GenomicAlignments_1.44.0 SummarizedExperiment_1.38.1
## [11] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [13] Rsamtools_2.24.0 GenomicRanges_1.60.0
## [15] Biostrings_2.76.0 GenomeInfoDb_1.44.0
## [17] XVector_0.48.0 IRanges_2.42.0
## [19] S4Vectors_0.46.0 BiocParallel_1.42.0
## [21] Biobase_2.68.0 BiocGenerics_0.54.0
## [23] generics_0.1.4 ggridges_0.5.6
## [25] ggrepel_0.9.6 stringr_1.5.1
## [27] gprofiler2_0.2.3 ggpubr_0.6.0
## [29] ggplot2_3.5.2 pheatmap_1.0.12
## [31] data.table_1.17.4 openxlsx_4.2.8
## [33] rmarkdown_2.29
##
## loaded via a namespace (and not attached):
## [1] spatstat.sparse_3.1-0 bitops_1.0-9 httr_1.4.7
## [4] RColorBrewer_1.1-3 tools_4.5.0 sctransform_0.4.2
## [7] backports_1.5.0 R6_2.6.1 lazyeval_0.2.2
## [10] uwot_0.2.3 withr_3.0.2 prettyunits_1.2.0
## [13] gridExtra_2.3 progressr_0.15.1 cli_3.6.5
## [16] spatstat.explore_3.4-3 fastDummies_1.7.5 labeling_0.4.3
## [19] sass_0.4.10 spatstat.data_3.1-6 pbapply_1.7-2
## [22] R.utils_2.13.0 parallelly_1.44.0 RSQLite_2.3.11
## [25] BiocIO_1.18.0 hwriter_1.3.2.1 ica_1.0-3
## [28] spatstat.random_3.4-1 car_3.1-3 dplyr_1.1.4
## [31] zip_2.3.3 Matrix_1.7-3 interp_1.1-6
## [34] abind_1.4-8 R.methodsS3_1.8.2 lifecycle_1.0.4
## [37] yaml_2.3.10 carData_3.0-5 SparseArray_1.8.0
## [40] BiocFileCache_2.16.0 Rtsne_0.17 grid_4.5.0
## [43] blob_1.2.4 promises_1.3.2 crayon_1.5.3
## [46] pwalign_1.4.0 miniUI_0.1.2 lattice_0.22-6
## [49] cowplot_1.1.3 annotate_1.86.0 GenomicFeatures_1.60.0
## [52] KEGGREST_1.48.0 pillar_1.10.2 knitr_1.50
## [55] rjson_0.2.23 future.apply_1.11.3 codetools_0.2-20
## [58] leiden_0.4.3.1 glue_1.8.0 spatstat.univar_3.1-3
## [61] vctrs_0.6.5 png_0.1-8 spam_2.11-1
## [64] gtable_0.3.6 cachem_1.1.0 aroma.light_3.38.0
## [67] xfun_0.52 S4Arrays_1.8.0 mime_0.13
## [70] survival_3.8-3 statmod_1.5.0 fitdistrplus_1.2-2
## [73] ROCR_1.0-11 nlme_3.1-168 bit64_4.6.0-1
## [76] progress_1.2.3 filelock_1.0.3 RcppAnnoy_0.0.22
## [79] bslib_0.9.0 irlba_2.3.5.1 KernSmooth_2.23-26
## [82] DBI_1.2.3 tidyselect_1.2.1 bit_4.6.0
## [85] compiler_4.5.0 curl_6.2.3 graph_1.86.0
## [88] httr2_1.1.2 xml2_1.3.8 DelayedArray_0.34.1
## [91] plotly_4.10.4 rtracklayer_1.68.0 scales_1.4.0
## [94] lmtest_0.9-40 rappdirs_0.3.3 digest_0.6.37
## [97] goftest_1.2-3 spatstat.utils_3.1-4 presto_1.0.0
## [100] htmltools_0.5.8.1 pkgconfig_2.0.3 jpeg_0.1-11
## [103] dbplyr_2.5.0 fastmap_1.2.0 rlang_1.1.6
## [106] htmlwidgets_1.6.4 UCSC.utils_1.4.0 shiny_1.10.0
## [109] farver_2.1.2 jquerylib_0.1.4 zoo_1.8-14
## [112] jsonlite_2.0.0 R.oo_1.27.1 RCurl_1.98-1.17
## [115] magrittr_2.0.3 Formula_1.2-5 GenomeInfoDbData_1.2.14
## [118] dotCall64_1.2 patchwork_1.3.0 Rcpp_1.0.14
## [121] reticulate_1.42.0 stringi_1.8.7 MASS_7.3-65
## [124] plyr_1.8.9 parallel_4.5.0 listenv_0.9.1
## [127] deldir_2.0-4 splines_4.5.0 tensor_1.5
## [130] hms_1.1.3 locfit_1.5-9.12 igraph_2.1.4
## [133] spatstat.geom_3.4-1 ggsignif_0.6.4 RcppHNSW_0.6.0
## [136] reshape2_1.4.4 biomaRt_2.64.0 XML_3.99-0.18
## [139] evaluate_1.0.3 latticeExtra_0.6-30 httpuv_1.6.16
## [142] RANN_2.6.2 tidyr_1.3.1 purrr_1.0.4
## [145] polyclip_1.10-7 future_1.49.0 scattermore_1.2
## [148] broom_1.0.8 xtable_1.8-4 restfulr_0.0.15
## [151] RSpectra_0.16-2 rstatix_0.7.2 later_1.4.2
## [154] viridisLite_0.4.2 singscore_1.28.0 tibble_3.2.1
## [157] memoise_2.0.1 AnnotationDbi_1.70.0 cluster_2.1.8.1
## [160] globals_0.18.0 GSEABase_1.70.0